Classifying a movie plot into genres was chosen as it provides a wide range of exploratory paths with data science methods and its application can be found in various sophisticated recommendation engines. The project aims at exploring various classifier algorithms, understanding their behaviors and enhancing the classifier accuracy to predict the genre.
Our goal with the project is to:
Plot summary descriptions scraped from Wikipedia
The dataset contains descriptions of 34,886 movies from around the world. Column descriptions are listed below:
In this section we will conduct data exploration to learn about the features that the data set has to offer.
#!pip install textblob
#!pip install scikit-plot
#!pip install pandas-profiling
#!pip install nltk
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import pandas_profiling
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn import metrics
import pickle
# Load the Wikipedia movie-plots dataset and take a first look at it.
df = pd.read_csv('data/wiki_movie_plots_deduped.csv')
df.head()
df.shape
df.describe()
# Column-by-column profile (distributions, missing values, distinct counts).
pandas_profiling.ProfileReport(df)
# Number of non-null Genre entries.
print(df['Genre'].count())
From the above profiling we have seen that the Genre column is highly irregular; there are spelling fixes and genre corrections which need to be done.
# Build a cleaned copy of the Genre column.  NOTE(review): `movies = df`
# aliases the original frame (no copy), so every edit below also mutates df.
# The replacement chain is order-sensitive, and on older pandas the patterns
# are interpreted as regular expressions by default.
movies = df
movies['GenreCorrected'] =movies['Genre']
movies['GenreCorrected']=movies['GenreCorrected'].str.strip()
# Normalise the various separators to '|'.
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' - ', '|')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' / ', '|')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('/', '|')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' & ', '|')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(', ', '|')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('; ', '|')
# Collapse spelling variants of 'biography'.
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('bio-pic', 'biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biopic', 'biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biographical', 'biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biodrama', 'biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('bio-drama', 'biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biographic', 'biography')
# Drop free-text annotations and normalise further spelling variants.
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(film genre\)', '')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('animated','animation')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('anime','animation')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('children\'s','children')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedey','comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\[not in citation given\]','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' set 4,000 years ago in the canadian arctic','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('historical','history')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romantic','romance')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('3-d','animation')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('3d','animation')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('viacom 18 motion pictures','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('sci-fi','science_fiction')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('ttriller','thriller')
# Continue the genre normalisation chain.  Each (pattern, replacement) pair
# below is applied in order as a regular expression, matching the rest of the
# cleanup chain, which relies on regex patterns (e.g. the escaped '\[140\]').
#
# BUG FIXES versus the original:
#   * '.' was passed unescaped — as a regex it matches EVERY character, so
#     the whole genre string was blanked out.  It is now escaped (r'\.') so
#     only literal dots are removed.
#   * ' (30min)' was passed unescaped — the parentheses form a regex group,
#     so the literal text " (30min)" was never matched.  It is now escaped,
#     consistent with the identical (already-escaped) rule later in the file.
_genre_regex_fixes = [
    (r'\.', ''),                                   # remove literal dots
    ('based on radio serial', ''),
    (' on the early years of hitler', ''),
    ('sci fi', 'science_fiction'),
    ('science fiction', 'science_fiction'),
    (r' \(30min\)', ''),                           # drop runtime annotation
    ('16 mm film', 'short'),
    (r'\[140\]', 'drama'),                         # stray citation marker
    (r'\[144\]', ''),
    (' for ', ''),
    ('adventures', 'adventure'),
    ('kung fu', 'martial_arts'),
    ('kung-fu', 'martial_arts'),
    ('martial arts', 'martial_arts'),
    ('world war ii', 'war'),                       # must run before 'world war i'
    ('world war i', 'war'),
]
for _pat, _repl in _genre_regex_fixes:
    movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(_pat, _repl, regex=True)
# Genre normalisation, continued: strip one-off descriptions, production-house
# names and citation fragments; keep collapsing spelling variants.
# NOTE(review): patterns are regexes on older pandas, so '\|', '\(' etc. are
# escaped metacharacters; the '((...).+)' patterns delete everything from the
# matched word to the end of the string.
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biography about montreal canadiens star|maurice richard','biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('bholenath movies|cinekorn entertainment','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(volleyball\)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('spy film','spy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('anthology film','anthology')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biography fim','biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('avant-garde','avant_garde')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biker film','biker')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('buddy cop','buddy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('buddy film','buddy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedy 2-reeler','comedy')
# Drop the generic words 'films'/'film' ('films' must run before 'film').
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('films','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('film','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biography of pioneering american photographer eadweard muybridge','biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('british-german co-production','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('bruceploitation','martial_arts')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedy-drama adaptation of the mordecai richler novel','comedy-drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('movies by the mob\|knkspl','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('movies','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('movie','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('coming of age','coming_of_age')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('coming-of-age','coming_of_age')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('drama about child soldiers','drama')
# Truncate trailing descriptions introduced by these connective words.
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( based).+)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( co-produced).+)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( adapted).+)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( about).+)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('musical b','musical')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('animationchildren','animation|children')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' period','period')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('drama loosely','drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(aquatics|swimming\)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(aquatics|swimming\)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace("yogesh dattatraya gosavi's directorial debut \[9\]",'')
# War-related variants all map to 'war'.
movies['GenreCorrected']=movies['GenreCorrected'].str.replace("war-time","war")
movies['GenreCorrected']=movies['GenreCorrected'].str.replace("wartime","war")
movies['GenreCorrected']=movies['GenreCorrected'].str.replace("ww1","war")
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('unknown','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace("wwii","war")
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('psychological','psycho')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('rom-coms','romance')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('true crime','crime')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|007','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('slice of life','slice_of_life')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('computer animation','animation')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('gun fu','martial_arts')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('j-horror','horror')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(shogi|chess\)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('afghan war drama','war drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|6 separate stories','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(30min\)','')
# NOTE(review): the unescaped ' (road bicycle racing)' below is a regex group
# and will not match the literal text; the escaped version further down does.
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' (road bicycle racing)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' v-cinema','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('tv miniseries','tv_miniseries')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|docudrama','\|documentary|drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' in animation','|animation')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('((adaptation).+)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('((adaptated).+)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('((adapted).+)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( on ).+)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('american football','sports')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dev\|nusrat jahan','sports')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('television miniseries','tv_miniseries')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(artistic\)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \|direct-to-dvd','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('history dram','history drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('martial art','martial_arts')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('psycho thriller,','psycho thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|1 girl\|3 suitors','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(road bicycle racing\)','')
# Fix a handful of genre strings that must be matched as whole values rather
# than substrings.  Series.replace with a dict performs exact-value mapping,
# which is equivalent to the boolean-filter + .loc assignments it replaces.
exact_genre_fixes = {
    "ero": "adult",
    "music": "musical",
    "-": "",
    "comedy–drama": "comedy|drama",
    "comedy–horror": "comedy|horror",
}
movies['GenreCorrected'] = movies['GenreCorrected'].replace(exact_genre_fixes)
# Final normalisation pass: convert the remaining spaces/commas into the '|'
# separator, then repair genre pairs that end up glued together once hyphens
# and separators are removed.  NOTE(review): this chain is order-sensitive —
# later replacements rely on the output of earlier ones — and a few lines are
# exact duplicates of earlier ones (kept as-is; they are harmless no-ops).
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' ','|')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace(',','|')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('-','')
# Split genre pairs that were concatenated without a separator.
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionadventure','action|adventure')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actioncomedy','action|comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actiondrama','action|drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionlove','action|love')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionmasala','action|masala')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionchildren','action|children')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('fantasychildren\|','fantasy|children')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('fantasycomedy','fantasy|comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('fantasyperiod','fantasy|period')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('cbctv_miniseries','tv_miniseries')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramacomedy','drama|comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramacomedysocial','drama|comedy|social')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramathriller','drama|thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedydrama','comedy|drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramathriller','drama|thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedyhorror','comedy|horror')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('sciencefiction','science_fiction')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('adventurecomedy','adventure|comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('animationdrama','animation|drama')
# Collapse double separators produced by earlier deletions.
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|\|','|')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('muslim','religious')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('thriler','thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('crimethriller','crime|thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('fantay','fantasy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionthriller','action|thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedysocial','comedy|social')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('martialarts','martial_arts')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|\(children\|poker\|karuta\)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('epichistory','epic|history')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('erotica','adult')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('erotic','adult')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('((\|produced\|).+)','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('chanbara','chambara')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedythriller','comedy|thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biblical','religious')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biblical','religious')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('colour\|yellow\|productions\|eros\|international','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|directtodvd','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('liveaction','live|action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('melodrama','drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('superheroes','superheroe')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('gangsterthriller','gangster|thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('heistcomedy','comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('heist','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('historic','history')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('historydisaster','history|disaster')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('warcomedy','war|comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('westerncomedy','western|comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('ancientcostume','costume')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('computeranimation','animation')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramatic','drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('familya','family')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('familya','family')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramedy','drama|comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramaa','drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('famil\|','family')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('superheroe','superhero')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biogtaphy','biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('devotionalbiography','devotional|biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('docufiction','documentary|fiction')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('familydrama','family|drama')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('espionage','spy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('supeheroes','superhero')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romancefiction','romance|fiction')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('horrorthriller','horror|thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('suspensethriller','suspense|thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('musicaliography','musical|biography')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('triller','thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|\(fiction\)','|fiction')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romanceaction','romance|action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romancecomedy','romance|comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romancehorror','romance|horror')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romcom','romance|comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('rom\|com','romance|comedy')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('satirical','satire')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('science_fictionchildren','science_fiction|children')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('homosexual','adult')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('sexual','adult')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('mockumentary','documentary')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('periodic','period')
# NOTE(review): 'romanctic' -> 'romantic' runs AFTER 'romantic' -> 'romance'
# earlier in the chain, so these rows end up as 'romantic', not 'romance' —
# confirm whether that is intended.
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romanctic','romantic')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('politics','political')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('samurai','martial_arts')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('tv_miniseries','series')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('serial','series')
# Exact-value fixes for strings containing dash/em-dash variants.
filterE = movies['GenreCorrected']=="musical–comedy"
movies.loc[filterE,'GenreCorrected'] = "musical|comedy"
filterE = movies['GenreCorrected']=="roman|porno"
movies.loc[filterE,'GenreCorrected'] = "adult"
filterE = movies['GenreCorrected']=="action—masala"
movies.loc[filterE,'GenreCorrected'] = "action|masala"
filterE = movies['GenreCorrected']=="horror–thriller"
movies.loc[filterE,'GenreCorrected'] = "horror|thriller"
# Coarse merge of related genres into a small set of major labels.
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('family','children')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('martial_arts','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('horror','thriller')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('war','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('adventure','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('science_fiction','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('western','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('western','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('noir','black')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('spy','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('superhero','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('social','')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('suspense','action')
filterE = movies['GenreCorrected']=="drama|romance|adult|children"
movies.loc[filterE,'GenreCorrected'] = "drama|romance|adult"
# Final tidy-up: stray dashes between separators, leading/trailing '|' and
# whitespace.
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|–\|','|')
movies['GenreCorrected']=movies['GenreCorrected'].str.strip(to_strip='\|')
movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionner','action')
movies['GenreCorrected']=movies['GenreCorrected'].str.strip()
# Frequency of each distinct corrected-genre string.
movies['Count']=1
movies[['GenreCorrected','Count']].groupby(['GenreCorrected'],as_index=False).count().sort_values(['Count'], ascending=False).head(10)
# Split the pipe-separated genres into sorted, de-duplicated arrays per movie.
movies['GenreSplit']=movies['GenreCorrected'].str.split('|')
movies['GenreSplit']= movies['GenreSplit'].apply(np.sort).apply(np.unique)
# Flatten the per-movie genre arrays into a single 1-D array of labels.
# BUG FIX: the original loop iterated range(0, movies.shape[0]-1), which
# silently skipped the LAST movie; it also grew the array with repeated
# np.concatenate calls (quadratic).  Concatenating the whole column at once
# fixes both.
if movies.shape[0]:
    genres_array = np.concatenate(movies['GenreSplit'].tolist())
else:
    genres_array = np.array([])
genres_array
# Count occurrences of each individual genre label.
genres = pd.DataFrame({'Genre':genres_array})
genres['Count']=1
genres[['Genre','Count']].groupby(['Genre'], as_index=False).sum().sort_values(['Count'], ascending=False).head(10)
genres=genres[['Genre','Count']].groupby(['Genre'], as_index=False).sum().sort_values(['Count'], ascending=False)
# Drop the empty-string pseudo-genre left over from the cleanup.
genres = genres[genres['Genre']!='']
genres.head(25)
# Total number of genre occurrences across all movies.
TotalCountGenres=sum(genres['Count'])
TotalCountGenres
We will be selecting the top genres from the list of observed genres whose cumulative frequency is at most 95%.
# Relative and cumulative frequency of each genre (genres is sorted by Count
# descending, so the cumulative sum runs from most to least frequent).
genres['Frequency'] = genres['Count']/TotalCountGenres
genres['CumulativeFrequency'] = genres['Frequency'].cumsum()
genres.head(20)
Selecting genres with cumulative frequency <= 0.957
# Genres that together account for ~95.7% of all genre occurrences.
np.array(genres[genres['CumulativeFrequency']<=.957]['Genre'])
genres[genres['CumulativeFrequency']<=.957][['Genre','Count']].plot(x='Genre', y='Count', kind='bar', legend=False, grid=True, figsize=(8, 5))
plt.title("Number of movies per genre")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('Movie genres', fontsize=12)
plt.show()
mainGenres=np.array(genres[genres['CumulativeFrequency']<=.957]['Genre'])
# Quick sanity check of the np.in1d filtering idiom used below.
arr1=np.array(['adult', 'romance', 'drama','and'])
arr1[np.in1d(arr1,mainGenres)] # genres not in the mainGenres array will be deleted
movies['GenreSplit'][10:12].apply(lambda x: x[np.in1d(x,mainGenres)])
# Keep only the main genres for each movie.
movies['GenreSplitMain'] = movies['GenreSplit'].apply(lambda x: x[np.in1d(x,mainGenres)])
movies[['GenreSplitMain','GenreSplit','Genre']][200:220]
movies['MainGenresCount'] = movies['GenreSplitMain'].apply(len)
max(movies['MainGenresCount'] )
movies['MainGenresCount'].hist()
count_vec = CountVectorizer(stop_words="english", analyzer='word',
ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)
# Two exploratory ways of expanding the genre lists into columns / dummies.
movies.GenreSplitMain[6:15].apply(lambda x: '-'.join(x)).str.split(pat='-',n=5,expand=True)
movies.GenreSplitMain[6:15].apply(lambda x: '-'.join(x)).str.get_dummies(sep='-')
movies.GenreSplitMain[6:15]
movies.columns
movies.shape
# the title is not unique
len(movies.Title.unique())
# The number of movies not having a genre
movies[movies.GenreCorrected==''].shape
Natural language processing (NLP) is a subfield of computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.
# Lower-case all plot text before tokenisation.
movies['Plot'].apply(lambda x : str.lower(x))[:5]
movies['Plot_correction'] = movies['Plot'].apply(lambda x : str.lower(x))
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
import nltk
dir(nltk)[:10]
# NOTE: this rebinds the name 'stopwords' (imported above as a module) to a
# plain list of English stop words; clean_text below uses the list.
stopwords = nltk.corpus.stopwords.words('english')
# WordNet lemmatizer and Porter stemmer (only the lemmatizer is used later).
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
Writing a function which can clean text for all the plots in the dataframe
def clean_text(text, stop_words=None):
    """Strip punctuation from *text*, tokenize on non-word runs, drop stop words.

    Parameters
    ----------
    text : str
        Raw (already lower-cased) plot text.
    stop_words : collection of str, optional
        Words to remove. Defaults to the module-level ``stopwords`` list
        (NLTK English stop words), preserving the original behaviour.

    Returns
    -------
    list of str
        The remaining tokens (may include an empty string if *text* starts
        with a non-word character, matching the original behaviour).
    """
    if stop_words is None:
        stop_words = stopwords
    # Iterating a str yields characters; drop punctuation chars one by one.
    no_punct = "".join(ch for ch in text if ch not in string.punctuation)
    # Raw string avoids the invalid '\W' escape warning of the original.
    tokens = re.split(r'\W+', no_punct)
    return [tok for tok in tokens if tok not in stop_words]
# Tokenise and de-noise every plot (punctuation and stop words removed).
movies['Plot_correction'] = movies['Plot_correction'].apply(lambda x: clean_text(x))
movies['Plot_correction']
After conducting research we tried out both stemming and lemmatizing techniques. We finally decided to use lemmatizing for the final plot word correction as it gives better results.
def lemmatizing(tokenized_text):
    """Return *tokenized_text* with every token lemmatized via the
    module-level WordNet lemmatizer ``wn``."""
    return [wn.lemmatize(token) for token in tokenized_text]
# Lemmatize each token list, then join the tokens back into a single string
# so the TF-IDF / count vectorizers can consume it.
movies['Plot_correction'] = movies['Plot_correction'].apply(lambda x: lemmatizing(x))
movies['Plot_correction'] = movies['Plot_correction'].apply(lambda x: " ".join(str(k) for k in x))
movies['Plot_correction']
Steps to be done:
movies['GenreSplit']
mainGenres
# NOTE(review): x is a numpy array of genre strings, so `x in mainGenres`
# tests the whole array rather than individual genres — presumably the intent
# is to keep only movies whose genres fall inside mainGenres; verify the
# behaviour on multi-genre rows.
movies['GenreSplit'] = movies['GenreSplit'].apply(lambda x: x if x in mainGenres else None)
movies['GenreSplit']
# Drop movies whose genres were filtered out above.
movies.dropna(subset=['GenreSplit'], inplace=True)
movies['GenreSplit']
from sklearn.preprocessing import MultiLabelBinarizer
# One-hot encode the (multi-)genre labels: one binary column per genre.
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(movies['GenreSplit'])
# transform target variable
y = multilabel_binarizer.transform(movies['GenreSplit'])
multilabel_binarizer.classes_
# Persist the binarizer so predictions can be mapped back to genre names.
pickle.dump(multilabel_binarizer, open('helper/multilabel_binarizer.pkl', 'wb'))
# TF-IDF features over the cleaned plot text; 80/20 train/validation split.
tfidf_vectorizer = TfidfVectorizer(stop_words ='english', smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
xtrain, xval, ytrain, yval = train_test_split(movies['Plot_correction'], y, test_size=0.2, random_state=9)
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)
pickle.dump(tfidf_vectorizer, open('helper/tfidf_vectorizer.pkl', 'wb'))
Writing helper functions to plot and view metrics for each genre
This function will plot a heat map of accuracy_score, precision, recall, fscore and roc_auc for a given classifier
def plot_metrics(title, clf, yval, y_pred, xval_tfidf):
    """Plot a heat map of per-genre accuracy, precision, recall, f-score and ROC AUC.

    Parameters
    ----------
    title : str
        Classifier name shown as the heat-map title.
    clf : fitted estimator
        Used to obtain decision scores / probabilities for ROC AUC.
    yval : array-like of shape (n_samples, n_classes)
        Binarized ground-truth labels (one column per genre).
    y_pred : array-like of shape (n_samples, n_classes)
        Binarized predictions.
    xval_tfidf : sparse matrix
        Validation features passed to the estimator for scoring.
    """
    # Renamed from `df` in the original, which shadowed the module-level df.
    scores = {}
    for i, genre in enumerate(multilabel_binarizer.classes_):
        true_col = [row[i] for row in yval]
        pred_col = [row[i] for row in y_pred]
        precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
            true_col, pred_col, average="micro")
        scores[genre] = {
            'accuracy_score': metrics.accuracy_score(true_col, pred_col),
            'precision': precision,
            'recall': recall,
            'fscore': fscore,
        }
        # ROC AUC needs continuous scores: prefer decision_function, then
        # predict_proba as a 2-D array, then predict_proba as a per-class
        # list of (n_samples, 2) arrays.  The original used bare `except:`,
        # which also swallowed KeyboardInterrupt/SystemExit; restricted to
        # Exception here.
        try:
            dec = clf.decision_function(xval_tfidf)
            scores[genre]['roc_auc'] = metrics.roc_auc_score(
                true_col, [row[i] for row in dec], average="micro")
        except Exception:
            try:
                proba = clf.predict_proba(xval_tfidf)
                scores[genre]['roc_auc'] = metrics.roc_auc_score(
                    true_col, [row[i] for row in proba], average="micro")
            except Exception:
                try:
                    proba = clf.predict_proba(xval_tfidf)
                    scores[genre]['roc_auc'] = metrics.roc_auc_score(
                        true_col, [row[1] for row in proba[i]], average="micro")
                except Exception:
                    scores[genre]['roc_auc'] = None
    frame = pd.DataFrame.from_dict(scores, orient='index')
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(frame, annot=True, cmap="YlGnBu", ax=ax)
    ax.set_title(title)
    plt.show()
This function will return accuracy_score, precision, recall, fscore and roc_auc for a given classifier as a dataframe
def metrics_df(title, clf, yval, y_pred, xval_tfidf):
    """Return per-genre accuracy, precision, recall, f-score and ROC AUC as a DataFrame.

    Parameters
    ----------
    title : str
        Classifier name, stored in the 'classifier' column of the result.
    clf : fitted estimator
        Used to obtain decision scores / probabilities for ROC AUC.
    yval : array-like of shape (n_samples, n_classes)
        Binarized ground-truth labels.
    y_pred : array-like of shape (n_samples, n_classes)
        Binarized predictions.
    xval_tfidf : sparse matrix
        Validation features passed to the estimator for scoring.

    Returns
    -------
    pandas.DataFrame
        One row per genre with columns genre, accuracy_score, precision,
        recall, fscore, roc_auc and classifier.
    """
    # Renamed from `df` in the original, which shadowed the module-level df.
    scores = {}
    for i, genre in enumerate(multilabel_binarizer.classes_):
        true_col = [row[i] for row in yval]
        pred_col = [row[i] for row in y_pred]
        precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
            true_col, pred_col, average="micro")
        scores[genre] = {
            'accuracy_score': metrics.accuracy_score(true_col, pred_col),
            'precision': precision,
            'recall': recall,
            'fscore': fscore,
        }
        # Same score-extraction fallback chain as plot_metrics; bare `except:`
        # replaced with `except Exception` so Ctrl-C is not swallowed.
        try:
            dec = clf.decision_function(xval_tfidf)
            scores[genre]['roc_auc'] = metrics.roc_auc_score(
                true_col, [row[i] for row in dec], average="micro")
        except Exception:
            try:
                proba = clf.predict_proba(xval_tfidf)
                scores[genre]['roc_auc'] = metrics.roc_auc_score(
                    true_col, [row[i] for row in proba], average="micro")
            except Exception:
                try:
                    proba = clf.predict_proba(xval_tfidf)
                    scores[genre]['roc_auc'] = metrics.roc_auc_score(
                        true_col, [row[1] for row in proba[i]], average="micro")
                except Exception:
                    scores[genre]['roc_auc'] = None
    result = pd.DataFrame.from_dict(scores, orient='index')
    result.reset_index(level=0, inplace=True)
    result.rename(columns={"index": "genre"}, inplace=True)
    result['classifier'] = title
    return result
This function plots the ROC curve for each of the genre for a given classifier
def plot_roc_curve(title, clf, yval, y_pred, xval_tfidf):
    """Plot one ROC curve per genre for the given classifier.

    Parameters
    ----------
    title : str
        Classifier name used in the plot title.
    clf : fitted estimator
        Used to obtain decision scores / probabilities.
    yval : array-like of shape (n_samples, n_classes)
        Binarized ground-truth labels.
    y_pred : array-like of shape (n_samples, n_classes)
        Unused here; kept for signature compatibility with the other
        metric helpers (the original computed an unused metric from it).
    xval_tfidf : sparse matrix
        Validation features passed to the estimator for scoring.
    """
    def _draw(true_col, decision_val, genre):
        # Draw one genre's ROC curve plus the chance diagonal and the shared
        # axis/legend styling (this body was triplicated in the original).
        fpr, tpr, thresholds = metrics.roc_curve(true_col, decision_val)
        auc_value = metrics.auc(fpr, tpr)
        plt.plot(fpr, tpr, label=genre + ' - auc (area = %0.2f)' % auc_value)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.rcParams['font.size'] = 12
        plt.legend(loc="best",bbox_to_anchor=(1, 1))
        plt.title('ROC curve for '+ title +' classifier')
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.grid(True)

    for i, genre in enumerate(multilabel_binarizer.classes_):
        true_col = [row[i] for row in yval]
        # Prefer decision_function scores; fall back to predict_proba (2-D
        # array, then per-class list).  Bare `except:` in the original also
        # swallowed KeyboardInterrupt; restricted to Exception here.
        try:
            dec = clf.decision_function(xval_tfidf)
            _draw(true_col, [row[i] for row in dec], genre)
        except Exception:
            proba = clf.predict_proba(xval_tfidf)
            try:
                _draw(true_col, [row[i] for row in proba], genre)
            except Exception:
                _draw(true_col, [row[1] for row in proba[i]], genre)
A convenience function which will perform plotting of metrics and ROC curves
def show_metrics(title, clf, yval, y_pred, xval_tfidf):
    """Plot the per-genre metrics and ROC curves for a fitted classifier.

    Parameters
    ----------
    title : str
        Classifier name used in the plot titles.
    clf : fitted estimator
        Classifier used to compute decision scores for the ROC curves.
    yval : array-like
        Binarized ground-truth labels for the validation set.
    y_pred : array-like
        Binarized predictions for the validation set.
    xval_tfidf : sparse matrix
        TF-IDF features of the validation set.

    Returns
    -------
    pandas.DataFrame
        Per-genre metrics table from ``metrics_df``.  The original computed
        this table, bound it to an unused local (``result_lr_df``) and
        discarded it; returning it makes the helper useful to callers while
        remaining backward compatible.
    """
    plot_metrics(title, clf, yval, y_pred, xval_tfidf)
    result_df = metrics_df(title, clf, yval, y_pred, xval_tfidf)
    plot_roc_curve(title, clf, yval, y_pred, xval_tfidf)
    return result_df
This function runs a grid search and returns the best classifier, the top-ranked parameter results, and the predicted labels for the validation set.
def grid_search_func(clr, parameters, xtrain_tfidf, ytrain, xval=None):
    """Run an exhaustive grid search and predict on the validation set.

    Parameters
    ----------
    clr : estimator
        Base classifier to tune.
    parameters : dict
        Parameter grid in ``GridSearchCV`` format.
    xtrain_tfidf : sparse matrix
        TF-IDF features of the training set.
    ytrain : array-like
        Training labels.
    xval : sparse matrix, optional
        Validation features to predict on.  Defaults to the module-level
        ``xval_tfidf``, which the original version relied on implicitly —
        the explicit parameter removes the hidden global dependency while
        staying backward compatible.

    Returns
    -------
    list
        ``[best_estimator, top-5 CV results (DataFrame), y_pred]``.
    """
    # error_score=0.0 keeps the search alive when an invalid parameter
    # combination (e.g. penalty='l1' with dual=True) raises during fitting.
    clr_cv = GridSearchCV(clr, param_grid=parameters, error_score=0.0)
    clr_cv.fit(xtrain_tfidf, ytrain)
    results = pd.DataFrame(clr_cv.cv_results_)
    sorted_results = results.sort_values(by=['rank_test_score']).head()
    clf = clr_cv.best_estimator_
    if xval is None:
        # Original behavior: fall back to the notebook-level validation matrix.
        xval = xval_tfidf
    y_pred = clf.predict(xval)
    return [clf, sorted_results, y_pred]
# --- Logistic Regression (one-vs-rest, one binary model per genre) --------
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
lr = LogisticRegression()
# n_jobs=-1 fits the per-genre binary models in parallel.
clf_lr = OneVsRestClassifier(lr, n_jobs=-1)
clf_lr.get_params().keys()
# 'estimator__*' keys address the wrapped LogisticRegression, not the
# OneVsRestClassifier wrapper.  Some combinations (penalty='l1' with
# dual=True) are invalid; grid_search_func's error_score=0.0 scores them 0.
parameters = {
'estimator__penalty': ["l1", "l2"],
'estimator__C': [0.5, 1., 5.],
'estimator__dual': [True, False],
"estimator__n_jobs":[-1],
}
clf_lr, best_params, y_pred_lr = grid_search_func(clf_lr, parameters, xtrain_tfidf, ytrain)
best_params
result_lr_df = metrics_df("LogisticRegression",clf_lr, yval, y_pred_lr, xval_tfidf)
result_lr_df
# NOTE(review): best_estimator_ is already fitted (GridSearchCV refit=True
# by default), so this second fit is redundant but harmless.
clf_lr.fit(xtrain_tfidf, ytrain)
pickle.dump(clf_lr, open('model/LogisticRegression_classifier.pkl', 'wb'))
show_metrics("LogisticRegression",clf_lr, yval, y_pred_lr, xval_tfidf)
# --- Multinomial Naive Bayes (one-vs-rest) --------------------------------
mnb = MultinomialNB(fit_prior=True, class_prior=None)
clf_mnb = OneVsRestClassifier(mnb, n_jobs=-1)
clf_mnb.get_params().keys()
# 'n_jobs' (no prefix) targets the OneVsRestClassifier wrapper itself;
# 'estimator__*' keys target the wrapped MultinomialNB.
parameters = {
    'estimator__alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
    'estimator__fit_prior': [True, False],
    'n_jobs': [-1]
}
# BUG FIX: the original assigned the tuned estimator to ``clf_mb`` (typo)
# but kept using ``clf_mnb`` below, so the grid-search winner was thrown
# away and the default-parameter model was refit, pickled and evaluated.
clf_mnb, best_params, y_pred_mnb = grid_search_func(clf_mnb, parameters, xtrain_tfidf, ytrain)
best_params
clf_mnb.fit(xtrain_tfidf, ytrain)
pickle.dump(clf_mnb, open('model/MultinomialNB_classifier.pkl', 'wb'))
result_mnb_df = metrics_df("MultinomialNB", clf_mnb, yval, y_pred_mnb, xval_tfidf)
result_mnb_df
show_metrics("MultinomialNB", clf_mnb, yval, y_pred_mnb, xval_tfidf)
# --- Linear SVC (one-vs-rest) ---------------------------------------------
lsvc = LinearSVC()
clf_lsvc = OneVsRestClassifier(lsvc, n_jobs=-1)
clf_lsvc.get_params().keys()
# 'n_jobs' targets the OneVsRestClassifier wrapper; 'estimator__*' keys
# target the wrapped LinearSVC.  Several combinations are invalid for
# LinearSVC (e.g. penalty='l1' with loss='hinge'); grid_search_func's
# error_score=0.0 scores those as 0 instead of aborting the search.
parameters = {
'estimator__penalty': ["l1", "l2"],
'estimator__loss': ["hinge", "squared_hinge"],
'estimator__dual': [True, False],
'estimator__tol': [1e-2],
'estimator__C': [ 0.5, 1., 5.],
"n_jobs":[-1]
}
clf_lsvc, best_params, y_pred_lsvc = grid_search_func(clf_lsvc, parameters, xtrain_tfidf, ytrain)
best_params
# NOTE(review): best_estimator_ is already fitted (GridSearchCV refit=True),
# so this refit is redundant but harmless.
clf_lsvc.fit(xtrain_tfidf, ytrain)
pickle.dump(clf_lsvc, open('model/LinearSVC_classifier.pkl', 'wb'))
result_lsvc_df = metrics_df("LinearSVC", clf_lsvc,yval, y_pred_lsvc, xval_tfidf)
result_lsvc_df
show_metrics("LinearSVC", clf_lsvc,yval, y_pred_lsvc, xval_tfidf)
# --- Decision Tree --------------------------------------------------------
from sklearn.tree import DecisionTreeClassifier
clf_dtc = DecisionTreeClassifier(random_state=0)
clf_dtc.get_params().keys()
# Shallow trees only (depth 8-9) to keep the fit tractable on the sparse
# TF-IDF matrix and to limit overfitting.
parameters = {
    'criterion': ["gini", "entropy"],
    'max_depth': range(8, 10)
}
clf_dtc, best_params, y_pred_dtc = grid_search_func(clf_dtc, parameters, xtrain_tfidf, ytrain)
best_params
clf_dtc.fit(xtrain_tfidf, ytrain)
# BUG FIX: the original pickled ``clf_lsvc`` (the LinearSVC model) under the
# DecisionTree filename; persist the decision tree itself.
pickle.dump(clf_dtc, open('model/DecisionTreeClassifier_classifier.pkl', 'wb'))
result_dtc_df = metrics_df("DecisionTreeClassifier", clf_dtc, yval, y_pred_dtc, xval_tfidf)
result_dtc_df
show_metrics("DecisionTreeClassifier", clf_dtc, yval, y_pred_dtc, xval_tfidf)
# --- Random Forest --------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier
clf_rfc = RandomForestClassifier(random_state=0)
clf_rfc.get_params().keys()
# High min_samples_* values regularize the trees on the sparse TF-IDF data.
parameters = {
    'criterion': ["gini", "entropy"],
    'min_samples_split': range(19, 21),
    'min_samples_leaf': range(19, 21),
    'n_jobs': [-1]
}
clf_rfc, best_params, y_pred_rfc = grid_search_func(clf_rfc, parameters, xtrain_tfidf, ytrain)
best_params
clf_rfc.fit(xtrain_tfidf, ytrain)
# BUG FIX: the original pickled ``clf_lsvc`` (the LinearSVC model) under the
# RandomForest filename; persist the random forest itself.
pickle.dump(clf_rfc, open('model/RandomForestClassifier_classifier.pkl', 'wb'))
result_rfc_df = metrics_df("RandomForestClassifier", clf_rfc, yval, y_pred_rfc, xval_tfidf)
result_rfc_df
show_metrics("RandomForestClassifier", clf_rfc, yval, y_pred_rfc, xval_tfidf)
# Combine the per-genre metric tables of the classical models.
# NOTE(review): result_dtc_df (decision tree) is computed above but not
# included here — confirm whether the omission is intentional.
comp = pd.concat([result_lsvc_df, result_mnb_df,result_lr_df, result_rfc_df], ignore_index=True)
comp.head()
# Best classifier per genre by accuracy (ascending sort + keep='last').
comp.sort_values('accuracy_score').drop_duplicates(['genre'],keep='last')
sns.set(rc={'figure.figsize':(18,10)})
sns.set(style="whitegrid")
plt.ylim(.6, 1.2)
s = sns.barplot(x="genre", y="accuracy_score", hue="classifier", data=comp)
s.set_title('Movies genre classification accuracy', size=16)
# mainGenres is defined earlier in the notebook (outside this excerpt).
s.set_xticklabels(list(mainGenres) ,rotation=45, size=15)
# Sample plot summary (Iron Man, 2008) intended as an ad-hoc prediction demo.
text = '''
Tony Stark, who has inherited the defense contractor Stark Industries from his father, is in war-torn Afghanistan with his friend and military liaison, Lieutenant Colonel James Rhodes, to demonstrate the new "Jericho" missile. After the demonstration, the convoy is ambushed and Stark is critically wounded by a missile used by the attackers: one of his company's own. He is captured and imprisoned in a cave by a terrorist group called the Ten Rings. Yinsen, a fellow captive doctor, implants an electromagnet into Stark's chest to keep the shrapnel shards that wounded him from reaching his heart and killing him. Ten Rings leader Raza offers Stark freedom in exchange for building a Jericho missile for the group, but Tony and Yinsen know that Raza will not keep his word.
Stark and Yinsen secretly build a small, powerful electric generator called an arc reactor to power Stark's electromagnet and a prototypical suit of powered armor to aid in their escape. Although they keep the suit hidden almost to completion, the Ten Rings discover their hostages' intentions and attack the workshop. Yinsen sacrifices himself to divert them while the suit is completed. The armored Stark battles his way out of the cave to find the dying Yinsen, then burns the Ten Rings' weapons in anger and flies away, crashing in the desert and destroying the suit. After being rescued by Rhodes, Stark returns home and announces that his company will cease manufacturing weapons. Obadiah Stane, his father's old partner and the company's manager, advises Stark that this may ruin Stark Industries and his father's legacy. In his home workshop, Stark builds a sleeker, more powerful version of his improvised armor suit as well as a more powerful arc reactor for his chest. Personal assistant Pepper Potts places the original reactor inside a small glass showcase. Though Stane requests details, a suspicious Stark decides to keep his work to himself.
At a charity event held by Stark Industries, reporter Christine Everhart informs Stark that his company's weapons were recently delivered to the Ten Rings and are being used to attack Yinsen's home village, Gulmira. Stark dons his new armor and flies to Afghanistan, where he saves the villagers. While flying home, Stark is attacked by two fighter jets. He reveals his secret identity to Rhodes over the phone in an attempt to end the attack. Meanwhile, the Ten Rings gather the pieces of Stark's prototype suit and meet with Stane, who has been trafficking arms to criminals worldwide and has staged a coup to replace Stark as Stark Industries' CEO by hiring the Ten Rings to kill him. He subdues Raza and has the rest of the group killed. Stane has a massive new suit reverse engineered from the wreckage. Seeking to track his company's illegal shipments, Stark sends Potts to hack into its database. She discovers that Stane hired the Ten Rings to kill Stark, but the group reneged. Potts meets with Agent Phil Coulson of S.H.I.E.L.D., an intelligence agency, to inform him of Stane's activities.
Stane's scientists cannot duplicate Stark's miniaturized arc reactor, so Stane ambushes Stark at his home and takes the one from his chest. Stark manages to get to his original reactor to replace it. Potts and several S.H.I.E.L.D. agents attempt to arrest Stane, but he dons his suit and attacks them. Stark fights Stane but is outmatched without his new reactor to run his suit at full capacity. The fight carries Stark and Stane to the top of the Stark Industries building, and Stark instructs Potts to overload the large arc reactor powering the building. This unleashes a massive electrical surge that causes Stane and his armor to fall into the exploding reactor, killing him. The next day, at a press conference, Stark defies suggestions from S.H.I.E.L.D. and publicly admits to being "Iron Man."
In a post-credits scene, S.H.I.E.L.D. Director Nick Fury visits Stark at home, telling him that Iron Man is not "the only superhero in the world", and explaining that he wants to discuss the "Avenger Initiative".
'''
# NOTE(review): ``text`` is never vectorized or passed to any model here.
# inverse_transform(y_pred_*)[0] decodes the predictions for the FIRST
# VALIDATION SAMPLE, not the genres of the plot above — to classify
# ``text`` it must first be transformed with the fitted TF-IDF vectorizer
# and predicted explicitly.
test = multilabel_binarizer.inverse_transform(y_pred_lr)[0] + multilabel_binarizer.inverse_transform(y_pred_lsvc)[0] + multilabel_binarizer.inverse_transform(y_pred_mnb)[0]
print(set(test))
A word embedding is a class of approaches for representing words and documents using a dense vector representation.
It is an improvement over the more traditional bag-of-words model encoding schemes, where large sparse vectors were used to represent each word or to score each word within a vector representing an entire vocabulary. These representations were sparse because the vocabularies were vast, and a given word or document would be represented by a large vector composed mostly of zero values.
Instead, in an embedding, words are represented by dense vectors where a vector represents the projection of the word into a continuous vector space.
The position of a word within the vector space is learned from text and is based on the words that surround the word when it is used.
The position of a word in the learned vector space is referred to as its embedding.
Two popular examples of methods of learning word embeddings from text include:
In addition to these carefully designed methods, a word embedding can be learned as part of a deep learning model. This can be a slower approach, but tailors the model to a specific training dataset.
Keras offers an Embedding layer that can be used for neural networks on text data.
It requires that the input data be integer encoded, so that each word is represented by a unique integer. This data preparation step can be performed using the Tokenizer API also provided with Keras.
The Embedding layer is initialized with random weights and will learn an embedding for all of the words in the training dataset.
It is a flexible layer that can be used in a variety of ways, such as:
The Embedding layer is defined as the first hidden layer of a network. It must specify 3 arguments:
For example, below we define an Embedding layer with a vocabulary of 200 (e.g. integer encoded words from 0 to 199, inclusive), a vector space of 32 dimensions in which words will be embedded, and input documents that have 50 words each.
e = Embedding(200, 32, input_length=50)
The Embedding layer has weights that are learned. If you save your model to file, this will include weights for the Embedding layer.
The output of the Embedding layer is a 2D vector with one embedding for each word in the input sequence of words (input document).
If you wish to connect a Dense layer directly to an Embedding layer, you must first flatten the 2D output matrix to a 1D vector using the Flatten layer.
Plots the model accuracy and the training & validation loss curves.
def plot_model_acc(history):
    """Plot training/validation accuracy and loss curves from a Keras History.

    Parameters
    ----------
    history : keras.callbacks.History
        Object returned by ``model.fit``; its ``history`` dict must contain
        train and validation series for accuracy and loss.
    """
    # Keras renamed the metric key from 'acc' to 'accuracy' (TF 2.x);
    # accept both so the helper works across versions.
    acc_key = 'acc' if 'acc' in history.history else 'accuracy'
    # Plot training & validation accuracy values
    plt.plot(history.history[acc_key])
    plt.plot(history.history['val_' + acc_key])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()
    # Plot training & validation loss values
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
# Corrected plot summaries are the documents; ``y`` is the binarized
# multi-label genre matrix built earlier in the notebook.
docs = movies['Plot_correction']
labels = y
# one_hot() hashes each word into [1, vocab_size); hash collisions between
# different words are possible at this vocabulary size.
vocab_size = 1000
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)
# pad documents to a max length of 4 words
# NOTE(review): 4 tokens keeps almost none of a plot summary — confirm this
# tiny context length is intentional.
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# Split data into train and test sets
(X_train, X_test, Y_train, Y_test) = train_test_split(padded_docs, labels, test_size=0.2, random_state=1)
# Split the training data again into train and validation sets
(X_train, X_val, y_train, y_val) = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)
# --- Neural network on one-hot (hashed) encodings -------------------------
vocab_size = 1000
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)
# pad documents to a fixed length
# NOTE(review): a max length of 4 tokens discards almost the entire plot
# summary; consider a much larger value.
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
# define the model: embedding -> flatten -> one sigmoid output per genre
model1 = Sequential()
model1.add(Embedding(vocab_size, 8, input_length=max_length))
model1.add(Flatten())
model1.add(Dense(20, activation='sigmoid'))
# binary cross-entropy treats each of the 20 genre outputs as an
# independent binary decision (multi-label setup)
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model1.summary())
# fit the model
history1 = model1.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, verbose=0)
# evaluate the model on the held-out validation split
loss, accuracy = model1.evaluate(X_val, y_val, verbose=0)
print('Accuracy: %f' % (accuracy*100))
plot_model_acc(history1)
# NOTE(review): pickling a Keras model is fragile; model1.save(...) is the
# supported persistence mechanism.
pickle.dump(model1, open('model/NueralNetworkClassifier_OneHot_classifier.pkl', 'wb'))
y_pred_keras_one_hot = model1.predict(X_test)
y_pred_keras_one_hot = y_pred_keras_one_hot.round(decimals=0, out=None)
# BUG FIX: the predictions are for X_test, so score them against Y_test;
# the original compared them with ``yval`` (labels of a different split).
result_keras_one_hot_df = metrics_df("NueralNetworkClassifier_OneHot", model1, Y_test, y_pred_keras_one_hot, X_test)
result_keras_one_hot_df
show_metrics("NueralNetworkClassifier_OneHot", model1, Y_test, y_pred_keras_one_hot, X_test)
# --- Keras Tokenizer with TF-IDF document encoding ------------------------
from keras.preprocessing.text import Tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(docs)
# summarize what was learned
print(t.word_counts)
print(t.document_count)
print(t.word_index)
print(t.word_docs)
# Encode each document as a dense TF-IDF-weighted vocabulary vector
# (shape: n_docs x vocab_size), not as a sequence of token indices.
encoded_docs = t.texts_to_matrix(docs, mode='tfidf')
print(encoded_docs)
vocab_size = 1000
print(encoded_docs)
# NOTE(review): encoded_docs is a TF-IDF matrix, so pad_sequences here
# truncates each row to its first 100 vocabulary weights rather than
# padding a token sequence — and the Embedding layer below expects integer
# token indices, not continuous TF-IDF weights.  Confirm this is intended.
max_length = 100
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# Split data into train and test sets
(X_train, X_test, Y_train, Y_test) = train_test_split(padded_docs, labels, test_size=0.2, random_state=1)
# Split the training data again into train and validation sets
(X_train, X_val, y_train, y_val) = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)
# The encoding/padding below repeats the steps above unchanged.
vocab_size = 1000
print(encoded_docs)
max_length = 100
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
# --- Neural network on TF-IDF encodings -----------------------------------
model2 = Sequential()
model2.add(Embedding(vocab_size, 8, input_length=max_length))
model2.add(Flatten())
# One sigmoid output per genre: independent binary decisions (multi-label).
model2.add(Dense(20, activation='sigmoid'))
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model2.summary())
# fit the model
history2 = model2.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, verbose=0)
# BUG FIX: evaluate on the held-out validation split; the original scored
# the model on the full ``padded_docs``/``labels``, which include the
# training samples and therefore overstate accuracy.
loss, accuracy = model2.evaluate(X_val, y_val, verbose=0)
print('Accuracy: %f' % (accuracy*100))
plot_model_acc(history2)
# NOTE(review): pickling a Keras model is fragile; model2.save(...) is the
# supported persistence mechanism.
pickle.dump(model2, open('model/NueralNetworkClassifier_TDIF_classifier.pkl', 'wb'))
y_pred_keras_tdif = model2.predict(X_test)
y_pred_keras_tdif = y_pred_keras_tdif.round(decimals=0, out=None)
# BUG FIX: predictions are on X_test — score against Y_test, not ``yval``.
result_keras_tdif_df = metrics_df("NueralNetworkClassifier_TDIF", model2, Y_test, y_pred_keras_tdif, X_test)
result_keras_tdif_df
show_metrics("NueralNetworkClassifier_TDIF", model2, Y_test, y_pred_keras_tdif, X_test)
# Score the Iron Man plot (``text``) with the TF-IDF tokenizer network.
# BUG FIX: texts_to_matrix expects a *list* of documents; passing the raw
# string made Keras iterate over it character by character, producing one
# "document" per character instead of one row for the whole plot.
test_text = t.texts_to_matrix([text], mode='tfidf')
max_length = 100
test_text = pad_sequences(test_text, maxlen=max_length, padding='post')
y_pred_keras = model2.predict(test_text)
# Re-point ``docs`` at the full movies frame for the GloVe experiment below.
docs = movies
# NOTE(review): encoding to UTF-8 and decoding straight back is a no-op for
# any valid str (it can only raise on lone surrogates, errors='strict') —
# confirm whether this normalization step is still needed.
docs['Plot_correction'] = docs['Plot_correction'].apply(lambda x : str(x.encode(encoding='UTF-8',errors='strict').decode("utf-8") ))
# docs.head()
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs['Plot_correction'].values)
# +1 because Keras word indices start at 1; row 0 stays reserved for padding.
vocab_size = len(t.word_index) + 1
# integer encode the documents (sequences of word indices this time,
# unlike the TF-IDF matrix encoding used above)
encoded_docs = t.texts_to_sequences(docs['Plot_correction'].values)
print(encoded_docs)
# pad documents to a max length of 4 words
# NOTE(review): 4 tokens keeps almost none of a plot summary — confirm this
# tiny context length is intentional.
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
# --- Load pre-trained GloVe vectors (300-d) into memory -------------------
embeddings_index = dict()
# Use a context manager so the file handle is closed even if a line fails
# to parse (the original relied on a manual f.close()).
with open('glove_6B/glove.6B.300d.txt', encoding="utf-8") as f:
    for line in f:
        # Each line: token followed by 300 whitespace-separated floats.
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))
# Build the embedding weight matrix for the training vocabulary: row i
# holds the GloVe vector of the token whose tokenizer index is i; tokens
# absent from GloVe keep their all-zero rows.
embedding_matrix = zeros((vocab_size, 300))
for token, row in t.word_index.items():
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]
# Split into train/test, then carve a validation set out of train
# (same random_state=1 as the earlier experiments for comparability).
(X_train, X_test, Y_train, Y_test) = train_test_split(padded_docs, labels, test_size=0.2, random_state=1)
# Split the training data again into train and validation sets
(X_train, X_val, y_train, y_val) = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)
# --- Neural network with frozen pre-trained GloVe embeddings --------------
model3 = Sequential()
# trainable=False freezes the GloVe weights so only the Dense layer learns.
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False)
model3.add(e)
model3.add(Flatten())
model3.add(Dense(20, activation='sigmoid'))
# compile the model: per-genre independent sigmoid outputs (multi-label)
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model3.summary())
# fit the model
history3 = model3.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, verbose=0)
# BUG FIX: evaluate on the held-out validation split; the original scored
# the model on ``padded_docs``/``labels``, which include training samples.
loss, accuracy = model3.evaluate(X_val, y_val, verbose=0)
print('Accuracy: %f' % (accuracy*100))
plot_model_acc(history3)
# NOTE(review): pickling a Keras model is fragile; model3.save(...) is the
# supported persistence mechanism.
pickle.dump(model3, open('model/NueralNetworkClassifier_extended_classifier.pkl', 'wb'))
y_pred_keras_extended = model3.predict(X_test)
y_pred_keras_extended = y_pred_keras_extended.round(decimals=0, out=None)
# BUG FIX: predictions are on X_test — score against Y_test, not ``yval``.
result_keras_extend_df = metrics_df("NueralNetworkClassifier_extended", model3, Y_test, y_pred_keras_extended, X_test)
result_keras_extend_df
show_metrics("NueralNetworkClassifier_extended", model3, Y_test, y_pred_keras_extended, X_test)
# Compare the three neural-network variants per genre.
comp = pd.concat([result_keras_one_hot_df, result_keras_tdif_df, result_keras_extend_df], ignore_index=True)
# Best network per genre by F-score (ascending sort + keep='last').
comp.sort_values('fscore').drop_duplicates(['genre'],keep='last')
sns.set(rc={'figure.figsize':(18,10)})
sns.set(style="whitegrid")
plt.ylim(.6, 1.1)
s = sns.barplot(x="genre", y="accuracy_score", hue="classifier", data=comp)
s.set_title('Movies genre classification accuracy', size=16)
# mainGenres is defined earlier in the notebook (outside this excerpt).
s.set_xticklabels(list(mainGenres) ,rotation=45, size=15)
# --- Final comparison of all eight classifiers ----------------------------
# (The original repeated this concat/sort pair twice verbatim; once is
# enough — the second run produced identical results.)
comp = pd.concat([result_lsvc_df, result_mnb_df, result_lr_df, result_dtc_df, result_rfc_df, result_keras_one_hot_df, result_keras_tdif_df, result_keras_extend_df], ignore_index=True)
# Best classifier per genre by F-score (ascending sort + keep='last').
comp.sort_values('fscore').drop_duplicates(['genre'], keep='last')
sns.set(rc={'figure.figsize': (18, 10)})
sns.set(style="whitegrid")
plt.ylim(.6, 1.2)
s = sns.barplot(x="genre", y="accuracy_score", hue="classifier", data=comp)
s.set_title('Movies genre classification accuracy', size=16)
# mainGenres is defined earlier in the notebook (outside this excerpt).
s.set_xticklabels(list(mainGenres), rotation=45, size=15)
# Persist the full comparison table for the report.
comp.to_csv('classifier_result.csv')
From the above we can see that LogisticRegression, MultinomialNB, RandomForestClassifier, NueralNetworkClassifier_OneHot and DecisionTreeClassifier achieved higher accuracies in many genres.
Random Forest appears to give overall better results than most of the other classifiers.
All the results of the model comparison are stored in classifier_result.csv.